Top K frequent words [Quick Select, Heap, Bucket Sort]

Time: O(N + K log K) on average; Space: O(N); Difficulty: medium

Given a non-empty list of words, return the k most frequent elements.

Your answer should be sorted by frequency from highest to lowest. If two words have the same frequency, then the word with the lower alphabetical order comes first.

Example 1:

Input: [“i”, “love”, “leetcode”, “i”, “love”, “coding”], k = 2

Output: [“i”, “love”]

Explanation:

  • “i” and “love” are the two most frequent words.

  • Note that “i” comes before “love” due to a lower alphabetical order.

Example 2:

Input: [“the”, “day”, “is”, “sunny”, “the”, “the”, “the”, “sunny”, “is”, “is”], k = 4

Output: [“the”, “is”, “sunny”, “day”]

Explanation:

  • “the”, “is”, “sunny” and “day” are the four most frequent words, with the numbers of occurrences being 4, 3, 2 and 1 respectively.

Notes:

  • You may assume k is always valid, 1 ≤ k ≤ number of unique elements.

  • Input words contain only lowercase letters.

Follow up:

  1. Try to solve it in O(n log k) time and O(n) extra space.

  2. Can you solve it in O(n) time with only O(k) extra space?

[1]:
from random import randint
import collections

class Solution1(object):
    def topKFrequent(self, words, k):
        """
        :type words: List[str]
        :type k: int
        :rtype: List[str]
        Quick Select Solution

        Return the k most frequent words, most frequent first; words with
        equal frequency are ordered alphabetically.  If k exceeds the number
        of distinct words, every distinct word is returned; if k <= 0 the
        result is empty.
        """
        counts = collections.Counter(words)
        # Negating the count makes plain ascending tuple order mean
        # "highest frequency first, then alphabetical".
        p = [(-val, key) for key, val in counts.items()]

        # Clamp k so an oversized request returns all words instead of
        # indexing past the end of the candidate list.
        k = min(k, len(p))
        if k <= 0:
            return []

        # Move the k smallest tuples into p[:k] (in arbitrary order) ...
        self.kthElement(p, k)
        # ... then only those k entries need a full sort.
        return [key for _, key in sorted(p[:k])]

    def kthElement(self, nums, k):  # O(n) on average
        """
        Partially order nums in place with quick select.

        For 1 <= k <= len(nums), on return nums[k - 1] is the k-th smallest
        item and nums[:k] contains the k smallest items in no particular
        order.  Returns None; the list is mutated.

        :type nums: List
        :type k: int
        """
        def partition_around_pivot(left, right, pivot_idx):
            # Lomuto partition of nums[left..right]: park the pivot at the
            # right end, sweep smaller items to the front, then drop the
            # pivot into its final sorted position and report that index.
            pivot_value = nums[pivot_idx]
            nums[pivot_idx], nums[right] = nums[right], nums[pivot_idx]
            store_idx = left
            for i in range(left, right):
                if nums[i] < pivot_value:
                    nums[i], nums[store_idx] = nums[store_idx], nums[i]
                    store_idx += 1
            nums[right], nums[store_idx] = nums[store_idx], nums[right]
            return store_idx

        left, right = 0, len(nums) - 1
        while left <= right:
            # A random pivot is what the "on average" bound relies on.
            pivot_idx = randint(left, right)
            new_pivot_idx = partition_around_pivot(left, right, pivot_idx)
            if new_pivot_idx == k - 1:
                return
            if new_pivot_idx > k - 1:
                # Target position lies in the left part.
                right = new_pivot_idx - 1
            else:
                # Target position lies in the right part.
                left = new_pivot_idx + 1
[2]:
# Run Solution1 against both worked examples from the problem statement.
s = Solution1()
for words, k, expected in (
    (["i", "love", "leetcode", "i", "love", "coding"], 2,
     ['i', 'love']),
    (["the", "day", "is", "sunny", "the", "the", "the", "sunny", "is", "is"], 4,
     ['the', 'is', 'sunny', 'day']),
):
    assert s.topKFrequent(words, k) == expected
[3]:
import collections
import heapq

class Solution2(object):
    def topKFrequent(self, words, k):
        """
        :type words: List[str]
        :type k: int
        :rtype: List[str]
        Heap Solution
        Time: O(Nlogk); Space: O(N)

        Keeps a min-heap of at most k (count, word) entries whose root is
        the weakest candidate seen so far, then unwinds the heap and
        reverses it to obtain the strongest-first ordering.
        """
        class MinHeapObj(object):
            """Heap entry wrapping a (count, word) tuple with custom order."""

            def __init__(self, val):
                self.val = val

            def __lt__(self, other):
                own_count, own_word = self.val[0], self.val[1]
                other_count, other_word = other.val[0], other.val[1]
                if own_count != other_count:
                    # Different counts: the tuple comparison is decided by
                    # the count, so the rarer word sorts first.
                    return self.val < other.val
                # Equal counts: the alphabetically later word sorts first,
                # so it is the one evicted from the heap.
                return own_word > other_word

            def __eq__(self, other):
                return self.val == other.val

            def __str__(self):
                return str(self.val)

        frequency = collections.Counter(words)
        heap = []
        for word, count in frequency.items():
            heapq.heappush(heap, MinHeapObj((count, word)))
            # Once the heap grows to k + 1 entries, drop the weakest one.
            if len(heap) == k + 1:
                heapq.heappop(heap)

        # Popping yields weakest-first order; flip it for the answer.
        weakest_first = [heapq.heappop(heap).val[1] for _ in range(len(heap))]
        weakest_first.reverse()
        return weakest_first
[4]:
# Run Solution2 against both worked examples from the problem statement.
s = Solution2()
for words, k, expected in (
    (["i", "love", "leetcode", "i", "love", "coding"], 2,
     ['i', 'love']),
    (["the", "day", "is", "sunny", "the", "the", "the", "sunny", "is", "is"], 4,
     ['the', 'is', 'sunny', 'day']),
):
    assert s.topKFrequent(words, k) == expected
[5]:
import collections

class Solution3(object):
    def topKFrequent(self, words, k):
        """
        :type words: List[str]
        :type k: int
        :rtype: List[str]
        Bucket Sort Solution
        Time: O(N + klogk) ~ O(N + NlogN); Space: O(N)

        Words are grouped into buckets indexed by their occurrence count.
        Buckets are then drained from the highest count downwards until at
        least k words are collected, and only those candidates are sorted.
        """
        counts = collections.Counter(words)
        # A word can occur at most len(words) times, so valid bucket
        # indices run from 0 to len(words) inclusive.
        buckets = [[] for _ in range(len(words) + 1)]
        for word, count in counts.items():
            buckets[count].append(word)

        pairs = []
        # Start at len(words) itself: that bucket is populated when every
        # input word is identical.  Bucket 0 is always empty, so stop at 1.
        for freq in range(len(words), 0, -1):
            # Negated frequency lets a plain ascending sort put the most
            # frequent words first and break ties alphabetically.
            pairs.extend((-freq, word) for word in buckets[freq])
            if len(pairs) >= k:
                break
        pairs.sort()

        # The last bucket taken may overshoot k; keep only the best k.
        return [word for _, word in pairs[:k]]
[6]:
# Run Solution3 against both worked examples from the problem statement.
s = Solution3()
for words, k, expected in (
    (["i", "love", "leetcode", "i", "love", "coding"], 2,
     ['i', 'love']),
    (["the", "day", "is", "sunny", "the", "the", "the", "sunny", "is", "is"], 4,
     ['the', 'is', 'sunny', 'day']),
):
    assert s.topKFrequent(words, k) == expected